# Add code here to load any required libraries with `library()`.
# We suggest you use `install.package()` for any required packages externally to this document
# since installation only need be done once.
#install.packages("ggplot2")
#install.packages("ggrepel")
#install.packages("tidyverse")
#install.packages("validate")
#install.packages('caret')
library(ggplot2) # for plotting
library(ggrepel) # for plot labels in pie chart
library(tidyverse) # for mutating data
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(modeest) # for calculating mode
## Registered S3 method overwritten by 'rmutil':
## method from
## print.response httr
library(validate) # for validator object
##
## Attaching package: 'validate'
##
## The following object is masked from 'package:dplyr':
##
## expr
##
## The following object is masked from 'package:ggplot2':
##
## expr
library(car) # for QQ plot method and vif
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(moments) # for skewness test
##
## Attaching package: 'moments'
##
## The following object is masked from 'package:modeest':
##
## skewness
library(caret) # for confusion matrix
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Assign your student id into the variable SID, for example:
SID <- 2246041 # This is an example, replace 2101234 with your actual ID
SIDoffset <- (SID %% 100) + 1 # Your SID mod 100 + 1
load("house-analysis.RDa")
# Now subset the housing data set
# Pick every 100th observation starting from your offset
# Put into your data frame named mydf (you can rename it) -> renamed to housedf
housedf <- house.analysis[seq(from=SIDoffset,to=nrow(house.analysis),by=100),]
To check for data quality, we would follow the following steps:
We’ll use head() or View() to get first glance at
the data. This will also help us understand the variables and set up
rules for the validator object.
We will count the rows with missing values, NAs, and blanks using
is.na() and colSums(). This step is additional as
summary() already shows NA values if any.
# Custom function for plotting used towards data quality analysis
#Defining functions for plotting via ggplot
#Custom Pie chart using ggplot
custom_pie <- function(var, title="", xlab, ylab){
if(title==""){
title=paste("Pie Chart for",xlab)
}
df <- as.data.frame(table(var))
df2 <- df %>%
mutate(csum = rev(cumsum(rev(Freq))),
pos = Freq/2 + lead(csum, 1),
pos = if_else(is.na(pos), Freq/2, pos))
ggplot(df, aes(x = "" , y = Freq, fill = var)) + geom_col(width = 1, color=alpha("white",alpha = 0.3)) +
coord_polar(theta = "y") +
geom_label_repel(data = df2,
aes(y = pos, label = paste0(round(100*Freq/sum(Freq),2), "%")),
size = 3, nudge_x = 1, show.legend = FALSE, color=alpha("#DDDDDD", alpha = 1)) +
guides(fill = guide_legend(title = xlab)) + ggtitle(title) +
theme_void()
}
#Custom Bar chart using ggplot
custom_bar <- function(var, title="", xlab, ylab){
if(title==""){
title=paste("Bar Graph for",xlab)
}
df <- as.data.frame(table(var))
ggplot(df, aes(x=var, y=Freq, fill=var)) +
geom_bar(width = 1, stat = "identity", color=alpha("white",alpha = 0.3)) +
guides(fill=guide_legend(title=xlab)) +
ggtitle(title) + xlab(xlab) + ylab(ylab) +
theme_minimal() +
geom_text(aes(x = var, y = Freq + max(Freq)/50, label = Freq), size=3)
}
#Custom Box Plot using ggplot
custom_box <- function(var, title="", fill="#bfe9ff", xlab, ylab){
if(title==""){
title=paste("Box Plot for",xlab)
}
ggplot(data = housedf, aes(x="",y=var)) +
geom_boxplot(fill=fill) +
ggtitle(title) + xlab(xlab) + ylab(ylab) +
theme_minimal()
}
#Custom Histogram using ggplot
custom_hist <- function(data, var, title="", fill="#ff6e7f", xlab, ylab, binwidth=1){
if(title=="") title=paste("Histogram for",xlab,"vs",ylab)
ggplot(data = data, aes(x=var)) +
geom_histogram(binwidth=binwidth, fill=fill,color=alpha("white",alpha = 0.3)) +
ggtitle(title) + xlab(xlab) + ylab(ylab) +
theme_minimal() +
stat_bin(aes(y=..count.., label=..count..), geom="text", vjust=-.5, binwidth = binwidth, size= 3)
}
# Checking dimensions of the data
dim(housedf)
## [1] 904 12
# Getting the variable names
names(housedf)
## [1] "id" "price" "mq"
## [4] "floor" "n_rooms" "n_bathrooms"
## [7] "has_terrace" "has_alarm" "heating"
## [10] "has_air_conditioning" "has_parking" "is_furnished"
# Getting the structure and checking the data type
str(housedf)
## 'data.frame': 904 obs. of 12 variables:
## $ id : int 84 274 477 669 943 1154 1368 1676 1970 2236 ...
## $ price : num 135000 38000 45000 189000 40000 38000 115000 12000 127000 44000 ...
## $ mq : num 150 80 110 120 45 90 87 80 55 66 ...
## $ floor : num 1 2 1 1 1 1 2 1 2 1 ...
## $ n_rooms : num 3 5 4 4 2 4 3 3 3 2 ...
## $ n_bathrooms : num 1 2 1 2 1 2 2 2 2 1 ...
## $ has_terrace : int 0 0 0 0 0 0 1 0 1 0 ...
## $ has_alarm : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heating : chr "autonomous" "autonomous" "autonomous" "autonomous" ...
## $ has_air_conditioning: int 1 0 0 0 1 0 0 0 1 0 ...
## $ has_parking : int 0 0 0 0 1 0 0 0 0 0 ...
## $ is_furnished : int 1 0 0 0 0 0 0 0 0 0 ...
# Displaying the numerical summary of the dataframe
summary(housedf)
## id price mq floor
## Min. : 84 Min. : 1000 Min. : 0.00 Min. :1.000
## 1st Qu.: 53956 1st Qu.: 79000 1st Qu.: 77.75 1st Qu.:1.000
## Median :113201 Median :125000 Median :100.00 Median :1.000
## Mean :111106 Mean :144084 Mean :115.78 Mean :1.762
## 3rd Qu.:167356 3rd Qu.:187250 3rd Qu.:135.00 3rd Qu.:2.000
## Max. :223309 Max. :500000 Max. :840.00 Max. :7.000
## n_rooms n_bathrooms has_terrace has_alarm
## Min. :-1.000 Min. :1.000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :1.000 Median :0.0000 Median :0.00000
## Mean : 3.485 Mean :1.425 Mean :0.1261 Mean :0.01106
## 3rd Qu.: 4.000 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. : 5.000 Max. :3.000 Max. :1.0000 Max. :1.00000
## heating has_air_conditioning has_parking is_furnished
## Length:904 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Mode :character Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.3208 Mean :0.0177 Mean :0.07965
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
# Eyeballing the data
#View(housedf)
head(housedf)
# Get no of Unique values in every column
# Loop through all columns in the dataset
for(col in names(housedf)) {
# Get the unique values in the column
unique_values <- unique(housedf[[col]])
# Get no of unique values
length_unique <- length(unique_values)
# If no of unique values is less, e.g. 20 supposedly
if(length_unique < 20) {
# Print the length and value of unique values if unique values are limited
print(paste(col, "has", length_unique,"unique values; Repeated values are ", toString(unique_values)))
}
else{
# Print the length of unique values in the column if unique values are not limited
print(paste(col, "has", length_unique,"unique values;"))
}
}
## [1] "id has 904 unique values;"
## [1] "price has 230 unique values;"
## [1] "mq has 180 unique values;"
## [1] "floor has 7 unique values; Repeated values are 1, 2, 3, 4, 5, 7, 6"
## [1] "n_rooms has 5 unique values; Repeated values are 3, 5, 4, 2, -1"
## [1] "n_bathrooms has 3 unique values; Repeated values are 1, 2, 3"
## [1] "has_terrace has 2 unique values; Repeated values are 0, 1"
## [1] "has_alarm has 2 unique values; Repeated values are 0, 1"
## [1] "heating has 3 unique values; Repeated values are autonomous, other, autonamous"
## [1] "has_air_conditioning has 2 unique values; Repeated values are 1, 0"
## [1] "has_parking has 2 unique values; Repeated values are 0, 1"
## [1] "is_furnished has 2 unique values; Repeated values are 1, 0"
# From metadata, uniqueness check above, and data summary,
# we can deduce the following:
# id: unique, numeric and non-negative
# price: numeric and non-negative
# mq: numeric and non-negative
# floor: categorical, 1 to 7
# n_rooms: categorical, -1, 2, 3, 4, 5
# n_bathrooms: categorical, 1, 2, 3
# has_terrace: binary, either 0 or 1
# has_alarm: binary, either 0 or 1
# heating: categorical, (either "autonomous" or "other" from the metadata)
# has_air_conditioning: binary, either 0 or 1
# has_parking: binary, either 0 or 1
# is_furnished: binary, either 0 or 1
# Data Quality Check using Validator
# the rules are built based on outputs from summary() and View()
house.rules <- validator(uniqId = is_unique(id),
posPrice = price > 0,
posMq = mq > 0,
posFloor = floor > 0,
posRooms = n_rooms > 0,
posBath = n_bathrooms > 0,
okTerc = is.element(has_terrace, c(0,1)),
okAlrm = is.element(has_alarm, c(0,1)),
okHeat = is.element(heating, c("autonomous","other")),
okAC = is.element(has_air_conditioning, c(0,1)),
okPrk = is.element(has_parking, c(0,1)),
okFur = is.element(is_furnished, c(0,1))
)
housedf.qual.chk <- confront(housedf, house.rules)
summary(housedf.qual.chk)
# Counting NA values
colSums(is.na(housedf))
## id price mq
## 0 0 0
## floor n_rooms n_bathrooms
## 0 0 0
## has_terrace has_alarm heating
## 0 0 0
## has_air_conditioning has_parking is_furnished
## 0 0 0
# Analysing id variable via numerical summary and checking if id is unique
summary(housedf$id)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 84 53956 113201 111106 167356 223309
nrow(housedf)==sum(housedf$id==unique(housedf$id))
## [1] TRUE
# Numerical Summaries, Boxplot and Histogram for numerical variables
# Analysing price
summary(housedf$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1000 79000 125000 144084 187250 500000
custom_box(housedf$price/100000, xlab="price", ylab="in 100k")
custom_hist(data=housedf, var=housedf$price/100000, xlab="price (in 100k)", ylab="Frequency")
# Analysing mq
summary(housedf$mq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 77.75 100.00 115.78 135.00 840.00
custom_box(housedf$mq, xlab="mq", ylab="")
custom_hist(data = housedf, var = housedf$mq, xlab = "mq", ylab = "sq. meters", binwidth = 100)
# Frequency Distribution Table, Piechart and Barplot for Categorical Variables
# Analysing floor
table(housedf$floor)
##
## 1 2 3 4 5 6 7
## 454 294 107 28 13 3 5
custom_pie(var=housedf$floor, title = "Pie Chart for floor distribution", xlab= "floor", ylab ="count")
custom_bar(var=housedf$floor, xlab= "floor", ylab ="count")
# Analysing n_rooms
table(housedf$n_rooms)
##
## -1 2 3 4 5
## 1 134 354 254 161
custom_pie(var=housedf$n_rooms, title = "Pie Chart for n_rooms distribution", xlab= "n_rooms", ylab ="count")
custom_bar(var=housedf$n_rooms, xlab= "n_rooms", ylab ="count")
# Analysing n_bathrooms
table(housedf$n_bathrooms)
##
## 1 2 3
## 556 312 36
custom_pie(var=housedf$n_bathrooms, title = "Pie Chart for n_bathrooms distribution", xlab= "n_bathrooms", ylab ="count")
custom_bar(var=housedf$n_bathrooms, xlab= "n_bathrooms", ylab ="count")
# Analysing has_terrace
table(housedf$has_terrace)
##
## 0 1
## 790 114
custom_pie(var=housedf$has_terrace, title = "Pie Chart for has_terrace distribution", xlab= "has_terrace", ylab ="count")
custom_bar(var=housedf$has_terrace, xlab= "has_terrace", ylab ="count")
# Analysing has_alarm
table(housedf$has_alarm)
##
## 0 1
## 894 10
custom_pie(var=housedf$has_alarm, title = "Pie Chart for has_alarm distribution", xlab= "has_alarm", ylab ="count")
custom_bar(var=housedf$has_alarm, xlab= "has_alarm", ylab ="count")
# Analysing heating
table(housedf$heating)
##
## autonamous autonomous other
## 1 789 114
custom_pie(var=housedf$heating, title = "Pie Chart for heating distribution", xlab= "heating", ylab ="count")
custom_bar(var=housedf$heating, xlab= "heating", ylab ="count")
# Analysing has_air_conditioning
table(housedf$has_air_conditioning)
##
## 0 1
## 614 290
custom_pie(var=housedf$has_air_conditioning, title = "Pie Chart for has_air_conditioning distribution", xlab= "has_air_conditioning", ylab ="count")
custom_bar(var=housedf$has_air_conditioning, xlab= "has_air_conditioning", ylab ="count")
# Analysing has_parking
table(housedf$has_parking)
##
## 0 1
## 888 16
custom_pie(var=housedf$has_parking, title = "Pie Chart for has_parking distribution", xlab= "has_parking", ylab ="count")
custom_bar(var=housedf$has_parking, xlab= "has_parking", ylab ="count")
# Analysing is_furnished
table(housedf$is_furnished)
##
## 0 1
## 832 72
custom_pie(var=housedf$is_furnished, title = "Pie Chart for is_furnished distribution", xlab= "is_furnished", ylab ="count")
custom_bar(var=housedf$is_furnished, xlab= "is_furnished", ylab ="count")
From the data, we have the following observations:
The following data quality issues were identified in the above step:
To clean the data, we’ll copy the existing data into a new dataframe housedf_clean and follow the listed steps in sequence:
# Data Cleaning
# Create new dataframe
housedf_clean <- housedf
# Viewing / correcting mq
housedf_clean[housedf_clean$mq==0,]
similar_set <- housedf[housedf$floor==1 & housedf$n_rooms==3 & housedf$n_bathrooms==2 & housedf$has_terrace==0 &
housedf$has_alarm==0 & housedf$heating=="autonomous" & housedf$has_air_conditioning==0 &
housedf$has_parking==0 & housedf$is_furnished==0,]
cor.test(similar_set$price, similar_set$mq)
##
## Pearson's product-moment correlation
##
## data: similar_set$price and similar_set$mq
## t = -0.46526, df = 21, p-value = 0.6465
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4926966 0.3247136
## sample estimates:
## cor
## -0.1010083
# In case of high correlation, we impute by the mean of mq from similar_set
#housedf_clean$mq[housedf_clean$mq==0] <- round(mean(similar_set$mq))
# In case of low / no correlation, we remove the observation having mq as 0
housedf_clean <- housedf_clean[which(housedf_clean$mq != 0),]
# Correcting n_rooms
# replace n_rooms == -1 with mode of n_rooms
housedf_clean$n_rooms[which(housedf$n_rooms == -1)] <- mlv(housedf_clean$n_rooms, method = "mfv")
# Correcting heating
housedf_clean$heating[housedf_clean$heating=='autonamous'] <- 'autonomous'
# Alternate corrections
# replace mq < 5, with mean of mq
#housedf_clean$mq[which(housedf_clean$mq <5)] <- mean(housedf_clean$mq)
# Replace n_rooms = -1 with positive value
#housedf_clean$n_rooms <- abs(housedf_clean$n_rooms)
# Converting the categorical columns into factors
categorical_vars <- c("floor", "n_rooms", "n_bathrooms", "has_terrace","has_alarm","heating","has_air_conditioning", "has_parking", "is_furnished")
for(var in categorical_vars){
housedf_clean[,var] <- as.factor(housedf_clean[,var])
}
# Checking data types and summary for housedf_clean
summary(housedf_clean)
## id price mq floor n_rooms n_bathrooms
## Min. : 84 Min. : 1000 Min. : 1.0 1:453 2:134 1:556
## 1st Qu.: 53898 1st Qu.: 79000 1st Qu.: 78.0 2:294 3:354 2:311
## Median :113063 Median :125000 Median :100.0 3:107 4:254 3: 36
## Mean :111010 Mean :144164 Mean :115.9 4: 28 5:161
## 3rd Qu.:167181 3rd Qu.:187500 3rd Qu.:135.0 5: 13
## Max. :223309 Max. :500000 Max. :840.0 6: 3
## 7: 5
## has_terrace has_alarm heating has_air_conditioning has_parking
## 0:789 0:893 autonomous:789 0:613 0:887
## 1:114 1: 10 other :114 1:290 1: 16
##
##
##
##
##
## is_furnished
## 0:831
## 1: 72
##
##
##
##
##
To perform exploratory data analysis on our data, we will follow the following steps:
From the above exploratory data analysis, the following conclusions can be made:
floor with heating and has_air_conditioning
n_rooms with n_bathrooms
n_bathrooms with has_terrace, has_alarm, and heating
has_terrace with has_air_conditioning, has_parking, is_furnished and has_air_conditioning
has_alarm with has_air_conditioning
has_air_conditioning with has_parking and is_furnished
Errors occurred during the test due to non-normal distribution of the variables. We are using simulate.p.value = T for ignoring those errors.
# Defining a custom function mysummary() to show variable summary
# and visualise the data in the form of histogram and bar plot
mysummary <- function(var, var_name){
print(summary(var))
# For numerical variables
if(class(var) == "numeric" || class(var) == "integer"){
# Histogram with density line
print(ggplot(housedf_clean, aes(x = var)) +
geom_histogram(aes(y=..density..), color="#111111", fill="#555555") +
geom_density(alpha=.2, fill="#00C0F0") +
labs(title = paste("Distribution for", var_name), x = var_name, y = "count"))
# Skewness
print(skewness(var))
# QQplot
qqPlot(var, distribution="norm", ylab = var_name, main = paste("Quantile Quantile Plot for",var_name), col.lines = "#006080")
}
# For Categorical Variables
else{
freq.df <- as.data.frame(table(var))
# Bar Plot
print(ggplot(data = housedf_clean) + aes(x=var) + geom_bar(color="#111111", fill="#555555") +
geom_text(data = freq.df, aes(x = var, y = Freq + max(Freq)/50, label = Freq), size=3) +
xlab(var_name) + ylab("Frequency") + ggtitle(paste("Distribution for", var_name)))
}
}
# Analysing Individual Variable
# Analysing price
mysummary(housedf_clean$price, var_name = "price")
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1000 79000 125000 144164 187500 500000
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] 1.233913
## [1] 360 503
# Analysing mq
mysummary(housedf_clean$mq, var_name = "mq")
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 78.0 100.0 115.9 135.0 840.0
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] 4.546405
## [1] 104 413
# Analysing floor
mysummary(housedf_clean$floor, var_name = "floor")
## 1 2 3 4 5 6 7
## 453 294 107 28 13 3 5
# Analysing n_rooms
mysummary(housedf_clean$n_rooms, var_name = "n_rooms")
## 2 3 4 5
## 134 354 254 161
# Analysing n_bathrooms
mysummary(housedf_clean$n_bathrooms, var_name = "n_bathrooms")
## 1 2 3
## 556 311 36
# Analysing has_terrace
mysummary(housedf_clean$has_terrace, var_name = "has_terrace")
## 0 1
## 789 114
# Analysing has_alarm
mysummary(housedf_clean$has_alarm, var_name = "has_alarm")
## 0 1
## 893 10
# Analysing heating
mysummary(housedf_clean$heating, var_name = "heating")
## autonomous other
## 789 114
# Analysing has_air_conditioning
mysummary(housedf_clean$has_air_conditioning, var_name = "has_air_conditioning")
## 0 1
## 613 290
# Analysing has_parking
mysummary(housedf_clean$has_parking, var_name = "has_parking")
## 0 1
## 887 16
# Analysing is_furnished
mysummary(housedf_clean$is_furnished, var_name = "is_furnished")
## 0 1
## 831 72
# Checking correlation between numerical columns
# Checking correlation between the price and mq
cor.test(housedf_clean$price, housedf_clean$mq)
##
## Pearson's product-moment correlation
##
## data: housedf_clean$price and housedf_clean$mq
## t = 10.036, df = 901, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2571621 0.3745726
## sample estimates:
## cor
## 0.3170817
# Scatter plots for dependency between price and mq
ggplot(data = housedf_clean) + aes(x=price/100000, y=mq) + geom_point() +
labs(title = "Distribution of mq vs price", x="Price in 100k", y="Area in Meter Square") +
geom_smooth(method = "lm", color = "#006080")
## `geom_smooth()` using formula 'y ~ x'
# Separate floor from rest of categorical variables for using fisher.test
# as the frequency of floor6, floor7 is 3 and 5
remaining.categorical <- categorical_vars[!categorical_vars == "floor"]
# Checking dependence among floor and remaining categorical variables using fisher.exact
# Perform the fisher.exact test of independence for floor and other categorical columns
for (j in 1:length(remaining.categorical)) {
fisher.result <- fisher.test(table(housedf_clean$floor, housedf_clean[,remaining.categorical[j]]), simulate.p.value = T)
if(fisher.result$p.value<0.05){
cat("\n")
print(paste("Dependance between floor and", remaining.categorical[j], "is significant with a p-value of", round(fisher.result$p.value,4)))
}
}
##
## [1] "Dependance between floor and heating is significant with a p-value of 5e-04"
##
## [1] "Dependance between floor and has_air_conditioning is significant with a p-value of 0.034"
# Checking dependence among remaining categorical variables using chi-sq test
# Perform the chi-squared test of independence for each pair of columns
for (i in 1:(length(remaining.categorical)-1)) {
for (j in (i+1):length(remaining.categorical)) {
chisq.result <- chisq.test(housedf_clean[,remaining.categorical[i]], housedf_clean[,remaining.categorical[j]], simulate.p.value = T)
#print(paste("Chi-Square Test between",columns[i], "and", columns[j], "gives p-value of", round(chisq.result$p.value,3)))
if(chisq.result$p.value<0.05){
cat("\n")
print(paste("Dependance between",remaining.categorical[i], "and", remaining.categorical[j], "is significant with a p-value of", round(chisq.result$p.value,4)))
}
}
}
##
## [1] "Dependance between n_rooms and n_bathrooms is significant with a p-value of 5e-04"
##
## [1] "Dependance between n_bathrooms and has_terrace is significant with a p-value of 0.003"
##
## [1] "Dependance between n_bathrooms and has_alarm is significant with a p-value of 0.04"
##
## [1] "Dependance between n_bathrooms and heating is significant with a p-value of 0.0325"
##
## [1] "Dependance between has_terrace and has_air_conditioning is significant with a p-value of 5e-04"
##
## [1] "Dependance between has_terrace and has_parking is significant with a p-value of 0.0085"
##
## [1] "Dependance between has_terrace and is_furnished is significant with a p-value of 0.0205"
##
## [1] "Dependance between has_alarm and has_air_conditioning is significant with a p-value of 0.0045"
##
## [1] "Dependance between has_air_conditioning and has_parking is significant with a p-value of 0.0015"
##
## [1] "Dependance between has_air_conditioning and is_furnished is significant with a p-value of 5e-04"
# Defining a custom function compare.category() to check the following
# for all (numerical variable, categorical variables) pairs in the data:
# 1) Aggregate mean for numerical variable across different categories of the categorical variable
# 2) ANOVA testing for difference in average value of numerical variable across different categories
# 3) Box Plot indicating the spread of numerical variable across different categories
compare.category <- function(num_var, cat_var){
# Creating custom formula for categorical variable ~ numerical variable
compare.formula <- as.formula(paste0(num_var," ~ ",cat_var))
# Aggregate mean for numerical variable vs categorical variable
print(paste("Aggregate Mean of",num_var,"between categories of",cat_var))
print(aggregate(compare.formula, data = housedf_clean, FUN="mean"))
cat("\n\n")
# ANOVA test to check if there's significant difference in average of the numerical variable of different categories
print(paste("ANOVA Test for",num_var,"~",cat_var))
print(summary.lm(aov(compare.formula, data=housedf_clean)))
# Auxiliary code for aesthetics and scaling
x_var <- housedf_clean[,cat_var]
y_var <- housedf_clean[,num_var]
box_col <- "#FF6080"
y_label <- num_var
if(num_var == "price"){
y_var <- y_var / 100000
y_label <- paste(num_var,"(in 100k)")
box_col <- "#006080"
}
print(paste("Box Plot for",num_var,"vs",cat_var))
# Box plot for price vs categorical
print(ggplot(data = housedf_clean) + aes(x = x_var, y = y_var) +
geom_boxplot(notch = F, color = box_col) +
labs(title = paste("Box Plot for",num_var,"distribution across",cat_var,"categories"), x = cat_var, y = y_label))
}
# Aggregate Mean, ANOVA Test, and Boxplot for price vs floor
compare.category(num_var = "price", cat_var = "floor")
## [1] "Aggregate Mean of price between categories of floor"
## floor price
## 1 1 146601.9
## 2 2 142478.3
## 3 3 130980.4
## 4 4 156803.6
## 5 5 152384.6
## 6 6 191333.3
## 7 7 184000.0
##
##
## [1] "ANOVA Test for price ~ floor"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145602 -66703 -20602 42522 359020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 146602 4408 33.255 <2e-16 ***
## floor2 -4124 7027 -0.587 0.557
## floor3 -15622 10085 -1.549 0.122
## floor4 10202 18271 0.558 0.577
## floor5 5783 26394 0.219 0.827
## floor6 44731 54350 0.823 0.411
## floor7 37398 42192 0.886 0.376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93830 on 896 degrees of freedom
## Multiple R-squared: 0.005307, Adjusted R-squared: -0.001354
## F-statistic: 0.7968 on 6 and 896 DF, p-value: 0.5725
##
## [1] "Box Plot for price vs floor"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs floor
compare.category(num_var = "mq", cat_var = "floor")
## [1] "Aggregate Mean of mq between categories of floor"
## floor mq
## 1 1 117.36424
## 2 2 116.41156
## 3 3 111.80374
## 4 4 113.14286
## 5 5 91.69231
## 6 6 92.33333
## 7 7 134.80000
##
##
## [1] "ANOVA Test for mq ~ floor"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.36 -38.36 -17.36 18.59 722.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.3642 3.5497 33.063 <2e-16 ***
## floor2 -0.9527 5.6583 -0.168 0.866
## floor3 -5.5605 8.1208 -0.685 0.494
## floor4 -4.2214 14.7126 -0.287 0.774
## floor5 -25.6719 21.2529 -1.208 0.227
## floor6 -25.0309 43.7642 -0.572 0.567
## floor7 17.4358 33.9739 0.513 0.608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.55 on 896 degrees of freedom
## Multiple R-squared: 0.002755, Adjusted R-squared: -0.003923
## F-statistic: 0.4125 on 6 and 896 DF, p-value: 0.871
##
## [1] "Box Plot for mq vs floor"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs n_rooms
compare.category(num_var = "price", cat_var = "n_rooms")
## [1] "Aggregate Mean of price between categories of n_rooms"
## n_rooms price
## 1 2 96179.75
## 2 3 138795.45
## 3 4 157038.19
## 4 5 175591.93
##
##
## [1] "ANOVA Test for price ~ n_rooms"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150592 -64488 -17038 42962 361205
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96180 7843 12.263 < 2e-16 ***
## n_rooms3 42616 9208 4.628 4.24e-06 ***
## n_rooms4 60858 9694 6.278 5.32e-10 ***
## n_rooms5 79412 10616 7.480 1.77e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90790 on 899 degrees of freedom
## Multiple R-squared: 0.06556, Adjusted R-squared: 0.06244
## F-statistic: 21.02 on 3 and 899 DF, p-value: 3.614e-13
##
## [1] "Box Plot for price vs n_rooms"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs n_rooms
compare.category(num_var = "mq", cat_var = "n_rooms")
## [1] "Aggregate Mean of mq between categories of n_rooms"
## n_rooms mq
## 1 2 68.49254
## 2 3 95.63559
## 3 4 134.70866
## 4 5 170.28571
##
##
## [1] "ANOVA Test for mq ~ n_rooms"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.29 -28.56 -10.49 10.33 669.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.493 5.845 11.718 < 2e-16 ***
## n_rooms3 27.143 6.863 3.955 8.26e-05 ***
## n_rooms4 66.216 7.224 9.166 < 2e-16 ***
## n_rooms5 101.793 7.912 12.865 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.66 on 899 degrees of freedom
## Multiple R-squared: 0.1974, Adjusted R-squared: 0.1948
## F-statistic: 73.72 on 3 and 899 DF, p-value: < 2.2e-16
##
## [1] "Box Plot for mq vs n_rooms"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs n_bathrooms
compare.category(num_var = "price", cat_var = "n_bathrooms")
## [1] "Aggregate Mean of price between categories of n_bathrooms"
## n_bathrooms price
## 1 1 118788.2
## 2 2 178284.4
## 3 3 241305.6
##
##
## [1] "ANOVA Test for price ~ n_bathrooms"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -211306 -58838 -19284 41212 381212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118788 3705 32.059 < 2e-16 ***
## n_bathrooms2 59496 6187 9.617 < 2e-16 ***
## n_bathrooms3 122517 15026 8.154 1.18e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 87370 on 900 degrees of freedom
## Multiple R-squared: 0.1336, Adjusted R-squared: 0.1317
## F-statistic: 69.42 on 2 and 900 DF, p-value: < 2.2e-16
##
## [1] "Box Plot for price vs n_bathrooms"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs n_bathrooms
compare.category(num_var = "mq", cat_var = "n_bathrooms")
## [1] "Aggregate Mean of mq between categories of n_bathrooms"
## n_bathrooms mq
## 1 1 99.49281
## 2 2 138.23473
## 3 3 176.55556
##
##
## [1] "ANOVA Test for mq ~ n_bathrooms"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -126.56 -34.49 -13.23 10.64 740.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.493 3.062 32.494 < 2e-16 ***
## n_bathrooms2 38.742 5.112 7.578 8.71e-14 ***
## n_bathrooms3 77.063 12.417 6.206 8.26e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72.2 on 900 degrees of freedom
## Multiple R-squared: 0.08526, Adjusted R-squared: 0.08323
## F-statistic: 41.94 on 2 and 900 DF, p-value: < 2.2e-16
##
## [1] "Box Plot for mq vs n_bathrooms"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs has_terrace
compare.category(num_var = "price", cat_var = "has_terrace")
## [1] "Aggregate Mean of price between categories of has_terrace"
## has_terrace price
## 1 0 140104.9
## 2 1 172253.9
##
##
## [1] "ANOVA Test for price ~ has_terrace"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150254 -65105 -20105 44895 359895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 140105 3318 42.224 < 2e-16 ***
## has_terrace1 32149 9339 3.443 0.000603 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93200 on 901 degrees of freedom
## Multiple R-squared: 0.01298, Adjusted R-squared: 0.01189
## F-statistic: 11.85 on 1 and 901 DF, p-value: 0.0006028
##
## [1] "Box Plot for price vs has_terrace"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs n_rooms
compare.category(num_var = "mq", cat_var = "has_terrace")
## [1] "Aggregate Mean of mq between categories of has_terrace"
## has_terrace mq
## 1 0 115.2129
## 2 1 120.7193
##
##
## [1] "ANOVA Test for mq ~ has_terrace"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.21 -39.72 -17.72 19.79 724.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.213 2.685 42.907 <2e-16 ***
## has_terrace1 5.506 7.557 0.729 0.466
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.42 on 901 degrees of freedom
## Multiple R-squared: 0.0005889, Adjusted R-squared: -0.0005203
## F-statistic: 0.5309 on 1 and 901 DF, p-value: 0.4664
##
## [1] "Box Plot for mq vs has_terrace"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs has_alarm
compare.category(num_var = "price", cat_var = "has_alarm")
## [1] "Aggregate Mean of price between categories of has_alarm"
## has_alarm price
## 1 0 143134
## 2 1 236100
##
##
## [1] "ANOVA Test for price ~ has_alarm"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142134 -65134 -18134 43866 356866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143134 3122 45.840 < 2e-16 ***
## has_alarm1 92966 29672 3.133 0.00179 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93310 on 901 degrees of freedom
## Multiple R-squared: 0.01078, Adjusted R-squared: 0.00968
## F-statistic: 9.817 on 1 and 901 DF, p-value: 0.001785
##
## [1] "Box Plot for price vs has_alarm"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs has_alarm
compare.category(num_var = "mq", cat_var = "has_alarm")
## [1] "Aggregate Mean of mq between categories of has_alarm"
## has_alarm mq
## 1 0 115.6439
## 2 1 139.5000
##
##
## [1] "ANOVA Test for mq ~ has_alarm"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.64 -38.64 -15.64 19.36 724.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.644 2.523 45.830 <2e-16 ***
## has_alarm1 23.856 23.978 0.995 0.32
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.4 on 901 degrees of freedom
## Multiple R-squared: 0.001097, Adjusted R-squared: -1.126e-05
## F-statistic: 0.9898 on 1 and 901 DF, p-value: 0.32
##
## [1] "Box Plot for mq vs has_alarm"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs heating
compare.category(num_var = "price", cat_var = "heating")
## [1] "Aggregate Mean of price between categories of heating"
## heating price
## 1 autonomous 143514.2
## 2 other 148657.9
##
##
## [1] "ANOVA Test for price ~ heating"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142514 -64514 -19658 43486 356486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143514 3339 42.976 <2e-16 ***
## heatingother 5144 9398 0.547 0.584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93800 on 901 degrees of freedom
## Multiple R-squared: 0.0003323, Adjusted R-squared: -0.0007772
## F-statistic: 0.2995 on 1 and 901 DF, p-value: 0.5843
##
## [1] "Box Plot for price vs heating"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs heating
compare.category(num_var = "mq", cat_var = "heating")
## [1] "Aggregate Mean of mq between categories of heating"
## heating mq
## 1 autonomous 118.56654
## 2 other 97.50877
##
##
## [1] "ANOVA Test for mq ~ heating"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -117.57 -38.57 -18.57 18.43 721.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118.567 2.674 44.335 < 2e-16 ***
## heatingother -21.058 7.527 -2.798 0.00526 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.12 on 901 degrees of freedom
## Multiple R-squared: 0.008612, Adjusted R-squared: 0.007512
## F-statistic: 7.827 on 1 and 901 DF, p-value: 0.005257
##
## [1] "Box Plot for mq vs heating"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs has_air_conditioning
compare.category(num_var = "price", cat_var = "has_air_conditioning")
## [1] "Aggregate Mean of price between categories of has_air_conditioning"
## has_air_conditioning price
## 1 0 141823.5
## 2 1 149109.8
##
##
## [1] "ANOVA Test for price ~ has_air_conditioning"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -140824 -66824 -21824 45176 358176
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 141824 3787 37.45 <2e-16 ***
## has_air_conditioning1 7286 6682 1.09 0.276
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93750 on 901 degrees of freedom
## Multiple R-squared: 0.001318, Adjusted R-squared: 0.0002096
## F-statistic: 1.189 on 1 and 901 DF, p-value: 0.2758
##
## [1] "Box Plot for price vs has_air_conditioning"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs has_air_conditioning
compare.category(num_var = "mq", cat_var = "has_air_conditioning")
## [1] "Aggregate Mean of mq between categories of has_air_conditioning"
## has_air_conditioning mq
## 1 0 116.1811
## 2 1 115.3310
##
##
## [1] "ANOVA Test for mq ~ has_air_conditioning"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.18 -38.18 -16.18 18.82 723.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 116.181 3.047 38.127 <2e-16 ***
## has_air_conditioning1 -0.850 5.377 -0.158 0.874
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.45 on 901 degrees of freedom
## Multiple R-squared: 2.774e-05, Adjusted R-squared: -0.001082
## F-statistic: 0.02499 on 1 and 901 DF, p-value: 0.8744
##
## [1] "Box Plot for mq vs has_air_conditioning"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs has_parking
compare.category(num_var = "price", cat_var = "has_parking")
## [1] "Aggregate Mean of price between categories of has_parking"
## has_parking price
## 1 0 144110.1
## 2 1 147125.0
##
##
## [1] "ANOVA Test for price ~ has_parking"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143110 -65110 -19110 43390 355890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 144110 3150 45.749 <2e-16 ***
## has_parking1 3015 23664 0.127 0.899
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93810 on 901 degrees of freedom
## Multiple R-squared: 1.801e-05, Adjusted R-squared: -0.001092
## F-statistic: 0.01623 on 1 and 901 DF, p-value: 0.8987
##
## [1] "Box Plot for price vs has_parking"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs has_parking
compare.category(num_var = "mq", cat_var = "has_parking")
## [1] "Aggregate Mean of mq between categories of has_parking"
## has_parking mq
## 1 0 115.7813
## 2 1 122.9375
##
##
## [1] "ANOVA Test for mq ~ has_parking"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.78 -38.78 -15.78 19.22 724.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.781 2.533 45.709 <2e-16 ***
## has_parking1 7.156 19.029 0.376 0.707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.44 on 901 degrees of freedom
## Multiple R-squared: 0.0001569, Adjusted R-squared: -0.0009528
## F-statistic: 0.1414 on 1 and 901 DF, p-value: 0.707
##
## [1] "Box Plot for mq vs has_parking"
# Aggregate Mean, ANOVA Test, and Boxplot for price vs is_furnished
compare.category(num_var = "price", cat_var = "is_furnished")
## [1] "Aggregate Mean of price between categories of is_furnished"
## is_furnished price
## 1 0 143157.3
## 2 1 155777.8
##
##
## [1] "ANOVA Test for price ~ is_furnished"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142157 -66778 -20778 43843 356843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143157 3252 44.018 <2e-16 ***
## is_furnished1 12620 11518 1.096 0.273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93750 on 901 degrees of freedom
## Multiple R-squared: 0.001331, Adjusted R-squared: 0.0002224
## F-statistic: 1.201 on 1 and 901 DF, p-value: 0.2735
##
## [1] "Box Plot for price vs is_furnished"
# Aggregate Mean, ANOVA Test, and Boxplot for mq vs is_furnished
compare.category(num_var = "mq", cat_var = "is_furnished")
## [1] "Aggregate Mean of mq between categories of is_furnished"
## is_furnished mq
## 1 0 115.7665
## 2 1 117.5417
##
##
## [1] "ANOVA Test for mq ~ is_furnished"
##
## Call:
## aov(formula = compare.formula, data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.77 -38.27 -17.54 19.23 724.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.767 2.617 44.234 <2e-16 ***
## is_furnished1 1.775 9.268 0.192 0.848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.44 on 901 degrees of freedom
## Multiple R-squared: 4.071e-05, Adjusted R-squared: -0.001069
## F-statistic: 0.03668 on 1 and 901 DF, p-value: 0.8482
##
## [1] "Box Plot for mq vs is_furnished"
Some insights and potential issues from the above analysis are:
The skewed distribution of price and mq may suggest the presence of extreme or abnormal values i.e outliers that could be influencing the overall trend.
The significant dependence between certain pairs of categorical variables, as indicated by the Chi-Square test, may indicate a relationship between these variables, and could potentially allow for predictions to be made about one variable based on the values of the other.
The significant effect of the number of rooms and bathrooms on both the price and size of a house indicates the importance of these factors in influencing the value and area of the property.
The significant differences in price between houses with and without certain features (such as terrace or alarm) indicates that these features are highly valued by buyers and could be used as selling points. Meanwhile, the significant differences in area for different heating groups indicate that heating may determinine property area.
The lack of significant differences or meaningful trends in price or size across groups of floor, has_parking, has_air_conditioning and is_furnished indicates that these variables are not as influential on the overall price or size of a property.
Given the research question regarding property price, an analysis plan would include the following steps:
Model Selection: Since the dependent variable is numerical, and the explanatory variables are a mix of categorical and numerical, we will use ANCOVA or multiple linear regression model for our research question.
Variable Selection: We’ll start with including variables that have a significant effect on the property price such as mq, n_bathrooms, has_terrace, has_alarm and heating. An alternate way that we’re following in our modelling would be to build a maximal model that incorporates all the variables in our dataset as it allows us to determine maximum potential performance for our model.
Variable Augmentation: Next, we’ll add additional variables to the model through transformation and interactions, as these new variables would capture non-linear relationships or higher order interactions that the original variables did not capture. This would also help with reduce the skewness of non-normally distributed variables like mq.
Fitting the model and evaluating the performance: We’ll fit the model using the selected variables and evaluate the performance through metrics such as R-squared, significant columns (p-value < 0.05), residuals and diagnostic plots.
Updating the model: Once we check the performance of the model, we’ll update the model to remove non-significant variables while retaining the significant ones. We use step() function for this as it updates the model in one go and then evaluate the performance of the significant model. We’ll also check multicollinearity between variables using vif() method (For more information on calculating variable inflation factor, see the following resource: [@vif]).
Interpreting model coefficients: Finally, we’ll analyse the coefficients to understand the impact of each variable on the property price.
# Building Maximal Model
price.max.model <- lm(formula = price ~ ., data = housedf_clean)
# Summary of the model
summary(price.max.model)
##
## Call:
## lm(formula = price ~ ., data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -183023 -56510 -14271 40666 389396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.436e+04 1.005e+04 6.401 2.51e-10 ***
## id 4.831e-02 4.360e-02 1.108 0.26812
## mq 2.816e+02 4.204e+01 6.699 3.74e-11 ***
## floor2 -3.613e+03 6.331e+03 -0.571 0.56832
## floor3 -1.364e+04 9.142e+03 -1.492 0.13593
## floor4 8.035e+03 1.657e+04 0.485 0.62791
## floor5 1.802e+04 2.386e+04 0.755 0.45038
## floor6 4.822e+04 4.916e+04 0.981 0.32696
## floor7 2.999e+04 3.803e+04 0.789 0.43061
## n_rooms3 2.514e+04 8.726e+03 2.882 0.00405 **
## n_rooms4 1.889e+04 9.778e+03 1.932 0.05365 .
## n_rooms5 1.791e+04 1.120e+04 1.598 0.11031
## n_bathrooms2 4.581e+04 6.554e+03 6.989 5.46e-12 ***
## n_bathrooms3 1.007e+05 1.517e+04 6.639 5.51e-11 ***
## has_terrace1 2.071e+04 8.734e+03 2.372 0.01791 *
## has_alarm1 6.488e+04 2.710e+04 2.394 0.01686 *
## heatingother 1.679e+04 8.696e+03 1.931 0.05382 .
## has_air_conditioning1 -1.776e+03 6.284e+03 -0.283 0.77760
## has_parking1 5.230e+03 2.160e+04 0.242 0.80876
## is_furnished1 7.953e+03 1.057e+04 0.752 0.45207
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 84060 on 883 degrees of freedom
## Multiple R-squared: 0.2132, Adjusted R-squared: 0.1963
## F-statistic: 12.59 on 19 and 883 DF, p-value: < 2.2e-16
# Updating the model using transformation of mq variable
price.max.model <- update(price.max.model, . ~ . + log(mq))
# Summary of the updated model
summary(price.max.model)
##
## Call:
## lm(formula = price ~ id + mq + floor + n_rooms + n_bathrooms +
## has_terrace + has_alarm + heating + has_air_conditioning +
## has_parking + is_furnished + log(mq), data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -184113 -57482 -14697 40618 389728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.914e+03 4.281e+04 0.091 0.92718
## id 4.585e-02 4.360e-02 1.051 0.29335
## mq 2.031e+02 6.842e+01 2.969 0.00307 **
## floor2 -4.068e+03 6.334e+03 -0.642 0.52090
## floor3 -1.397e+04 9.139e+03 -1.529 0.12668
## floor4 7.140e+03 1.657e+04 0.431 0.66672
## floor5 1.764e+04 2.385e+04 0.740 0.45967
## floor6 4.711e+04 4.914e+04 0.959 0.33800
## floor7 2.840e+04 3.802e+04 0.747 0.45534
## n_rooms3 2.152e+04 9.070e+03 2.373 0.01784 *
## n_rooms4 1.362e+04 1.042e+04 1.307 0.19157
## n_rooms5 1.246e+04 1.181e+04 1.056 0.29139
## n_bathrooms2 4.380e+04 6.695e+03 6.542 1.03e-10 ***
## n_bathrooms3 9.772e+04 1.530e+04 6.388 2.72e-10 ***
## has_terrace1 2.027e+04 8.733e+03 2.322 0.02048 *
## has_alarm1 6.448e+04 2.708e+04 2.381 0.01749 *
## heatingother 1.653e+04 8.693e+03 1.902 0.05755 .
## has_air_conditioning1 -1.528e+03 6.283e+03 -0.243 0.80794
## has_parking1 6.357e+03 2.161e+04 0.294 0.76865
## is_furnished1 7.574e+03 1.057e+04 0.717 0.47377
## log(mq) 1.617e+04 1.114e+04 1.452 0.14674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 84010 on 882 degrees of freedom
## Multiple R-squared: 0.2151, Adjusted R-squared: 0.1973
## F-statistic: 12.08 on 20 and 882 DF, p-value: < 2.2e-16
# Updating the model using step() method
price.model <- step(price.max.model)
## Start: AIC=20498.36
## price ~ id + mq + floor + n_rooms + n_bathrooms + has_terrace +
## has_alarm + heating + has_air_conditioning + has_parking +
## is_furnished + log(mq)
##
## Df Sum of Sq RSS AIC
## - floor 6 3.7057e+10 6.2614e+12 20492
## - has_air_conditioning 1 4.1726e+08 6.2248e+12 20496
## - has_parking 1 6.1094e+08 6.2250e+12 20496
## - is_furnished 1 3.6246e+09 6.2280e+12 20497
## - id 1 7.8016e+09 6.2322e+12 20498
## <none> 6.2244e+12 20498
## - log(mq) 1 1.4887e+10 6.2393e+12 20498
## - n_rooms 3 4.3513e+10 6.2679e+12 20499
## - heating 1 2.5519e+10 6.2499e+12 20500
## - has_terrace 1 3.8036e+10 6.2624e+12 20502
## - has_alarm 1 3.9996e+10 6.2644e+12 20502
## - mq 1 6.2214e+10 6.2866e+12 20505
## - n_bathrooms 2 4.6995e+11 6.6943e+12 20560
##
## Step: AIC=20491.72
## price ~ id + mq + n_rooms + n_bathrooms + has_terrace + has_alarm +
## heating + has_air_conditioning + has_parking + is_furnished +
## log(mq)
##
## Df Sum of Sq RSS AIC
## - has_parking 1 2.2217e+08 6.2617e+12 20490
## - has_air_conditioning 1 2.5584e+08 6.2617e+12 20490
## - is_furnished 1 5.5116e+09 6.2670e+12 20490
## - id 1 8.9765e+09 6.2704e+12 20491
## <none> 6.2614e+12 20492
## - log(mq) 1 1.5108e+10 6.2766e+12 20492
## - n_rooms 3 4.6168e+10 6.3076e+12 20492
## - heating 1 2.7413e+10 6.2889e+12 20494
## - has_alarm 1 3.8856e+10 6.3003e+12 20495
## - has_terrace 1 4.0446e+10 6.3019e+12 20496
## - mq 1 6.2864e+10 6.3243e+12 20499
## - n_bathrooms 2 4.6637e+11 6.7278e+12 20553
##
## Step: AIC=20489.75
## price ~ id + mq + n_rooms + n_bathrooms + has_terrace + has_alarm +
## heating + has_air_conditioning + is_furnished + log(mq)
##
## Df Sum of Sq RSS AIC
## - has_air_conditioning 1 2.0815e+08 6.2619e+12 20488
## - is_furnished 1 5.4513e+09 6.2671e+12 20488
## - id 1 9.0811e+09 6.2708e+12 20489
## <none> 6.2617e+12 20490
## - log(mq) 1 1.4991e+10 6.2767e+12 20490
## - n_rooms 3 4.6114e+10 6.3078e+12 20490
## - heating 1 2.7683e+10 6.2894e+12 20492
## - has_alarm 1 3.8741e+10 6.3004e+12 20493
## - has_terrace 1 4.1304e+10 6.3030e+12 20494
## - mq 1 6.3308e+10 6.3250e+12 20497
## - n_bathrooms 2 4.6661e+11 6.7283e+12 20551
##
## Step: AIC=20487.78
## price ~ id + mq + n_rooms + n_bathrooms + has_terrace + has_alarm +
## heating + is_furnished + log(mq)
##
## Df Sum of Sq RSS AIC
## - is_furnished 1 5.2622e+09 6.2671e+12 20486
## - id 1 9.1197e+09 6.2710e+12 20487
## <none> 6.2619e+12 20488
## - log(mq) 1 1.5125e+10 6.2770e+12 20488
## - n_rooms 3 4.5974e+10 6.3079e+12 20488
## - heating 1 2.7520e+10 6.2894e+12 20490
## - has_alarm 1 3.8587e+10 6.3005e+12 20491
## - has_terrace 1 4.1516e+10 6.3034e+12 20492
## - mq 1 6.3196e+10 6.3251e+12 20495
## - n_bathrooms 2 4.6654e+11 6.7284e+12 20549
##
## Step: AIC=20486.54
## price ~ id + mq + n_rooms + n_bathrooms + has_terrace + has_alarm +
## heating + log(mq)
##
## Df Sum of Sq RSS AIC
## - id 1 8.8659e+09 6.2760e+12 20486
## <none> 6.2671e+12 20486
## - log(mq) 1 1.5465e+10 6.2826e+12 20487
## - n_rooms 3 4.7527e+10 6.3147e+12 20487
## - heating 1 2.6078e+10 6.2932e+12 20488
## - has_alarm 1 3.8945e+10 6.3061e+12 20490
## - has_terrace 1 4.4244e+10 6.3114e+12 20491
## - mq 1 6.2420e+10 6.3296e+12 20494
## - n_bathrooms 2 4.6413e+11 6.7313e+12 20547
##
## Step: AIC=20485.82
## price ~ mq + n_rooms + n_bathrooms + has_terrace + has_alarm +
## heating + log(mq)
##
## Df Sum of Sq RSS AIC
## <none> 6.2760e+12 20486
## - log(mq) 1 1.6402e+10 6.2924e+12 20486
## - n_rooms 3 4.8658e+10 6.3247e+12 20487
## - heating 1 2.6919e+10 6.3029e+12 20488
## - has_alarm 1 3.9978e+10 6.3160e+12 20490
## - has_terrace 1 4.4969e+10 6.3210e+12 20490
## - mq 1 5.9495e+10 6.3355e+12 20492
## - n_bathrooms 2 4.5917e+11 6.7352e+12 20546
# Summary of the model
summary(price.model)
##
## Call:
## lm(formula = price ~ mq + n_rooms + n_bathrooms + has_terrace +
## has_alarm + heating + log(mq), data = housedf_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190684 -58003 -14541 39935 383769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3807.50 42428.86 0.090 0.92852
## mq 197.58 67.95 2.908 0.00373 **
## n_rooms3 22546.44 8996.05 2.506 0.01238 *
## n_rooms4 14134.88 10338.57 1.367 0.17191
## n_rooms5 12803.13 11731.20 1.091 0.27540
## n_bathrooms2 43308.89 6659.28 6.504 1.31e-10 ***
## n_bathrooms3 95665.78 15228.22 6.282 5.21e-10 ***
## has_terrace1 21427.56 8475.73 2.528 0.01164 *
## has_alarm1 63979.13 26840.19 2.384 0.01735 *
## heatingother 16574.03 8473.47 1.956 0.05078 .
## log(mq) 16903.30 11070.92 1.527 0.12716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83880 on 892 degrees of freedom
## Multiple R-squared: 0.2086, Adjusted R-squared: 0.1997
## F-statistic: 23.51 on 10 and 892 DF, p-value: < 2.2e-16
# Diagnostic plots for the final model
plot(price.model)
# Checking multi-collinearity in the model parameters
# No multi-collinearity as the value of VIF in the output is almost equal to 1
vif(price.model)
## GVIF Df GVIF^(1/(2*Df))
## mq 3.365315 1 1.834480
## n_rooms 1.650749 3 1.087127
## n_bathrooms 1.340342 2 1.075979
## has_terrace 1.017021 1 1.008475
## has_alarm 1.012550 1 1.006255
## heating 1.016480 1 1.008206
## log(mq) 4.132373 1 2.032824
The linear regression equation from our significant model is
\(price = 197.58 * mq + 22546.44 * n_rooms3 + 14134.88 * n_rooms4 + 12803.13 * n_rooms5 + 43308.89 * n_bathrooms2 + 95665.78 * n_bathrooms3 + 21427.56 * has_terrace1 + 63979.13 * has_alarm1 + 16574.03 * heatingother + 16903.30 * log(mq) + 3807.50\)
The above is a linear regression model to predict the price of the property based on factors listed in the coefficients. We are using step function to remove insignificant dependent variables from our model. On the basis of the final model summary, the following observations can be made:
In our final model, the independent variables influencing price in a significant way are mq, n_rooms, n_bathrooms, has_terrace, has_alarm & heating. This is indicated by the p-value of the corresponding coefficients being less than 0.05. The variable log(mq) has a p-value of 0.1276 > 0.05, implying the relationship between log(mq) and price is not statistically significant at 0.05 level. However, this doesn’t rule out any affect of the variable on the response variable price.
The R-squared value that represents the goodness of fit is 0.2086, which means the model explains about 21% of the variance in the response variable, price.
The residuals which represent the difference between the observed values for price and the predicted values from the model, have a large overall range (from -190684 to 383769) suggesting that the model is not fitting the data accurately.
The overall p-value for the model is < 2.2e-16, indicating the model is statistically significant.
The Residual vs Fitted plot and the upward slope in Scale-Location plot indicate heteroscedasticity i.e. the variance of the residuals is not constant. The distribution of residuals is relatively concentrated around the lower fitted values, and gets increasingly spread out till around 200000, after which the distribution of observations decreases.
The banana-shaped curve on the normal Q-Q plot indicates the residuals are not normally distributed and the distribution is skewed.
The Residuals vs Leverage plot shows some outliers but no highly influential case beyond the cook’s distance.
There is no evidence of high multicollinearity between the independent variables as the value of Variance Inflation Factor (VIF) for each variable nears 1.
Potential weaknesses of the model include
The relatively low R-squared value at 0.2086, indicating the model explains only 21% of the variance in price, which may not be enough to make accurate predictions. This could also be an indication of other factors influencing the price of the property that were not originally mentioned in the provided data.
The residuals have a large range, suggesting that the model is not accurately fitting the data. This may be caused by outliers, nonlinear relationships, or other factors that the model is unable to capture.
The presence of heteroscedasticity and influential outliers could cause problems with the model fit and may lead to biased / inaccurate parameter estimates.
As indicated by the banana-curve on the QQ plot, the residuals have a skewed distribution i.e the errors are not normally distributed. This could cause problems for models requiring normally distributed errors.
Based on the above mentioned weaknesses, it appears that the linear regression model may not be a good choice for modeling the data, despite being a statistically significant model for predicting the price of the property.
Based on the above weakness, the following improvements are recommended:
To improve the model’s fit and increase the R-squared value, we could consider adding more variables to the model, through transformation or interactions. This would help capture additional relationships in the data and increase the model’s explanatory power.
To address the issue of residuals having a large range, it may be necessary to perform further data cleaning to remove outliers and unusual values (such as mq being 1), and merging categories with lower frequency, for e.g. merging the observation with floors 4,5, and 6 into a single category.
To address the concerns regarding the skewed distribution for dependent variable price and independent variables such as mq, we could try further transforming them.
To address influential outliers and heteroscedasticity, we could try removing those outliers and use models that are less susceptible to outliers and non-constant variances.
Since the model fit of the linear regression model is low, the use of non-linear model such as polynomial regression etc. could potentially improve model fit and accuracy.
An analysis plan for modelling the likelihood of property being furnished is:
Model Selection: Since the dependent variable is binary, and the explanatory variables are a mix of categorical and numerical, we will use logistic regression.
Building Maximal Model: We’ll build a maximal model by including all the variables in our model as it allows us to determine the model’s maximum potential performance.
Evaluating Model significance: Once we fit the model, we check for the presence of non-significant variables (p-value < 0.05) in our model. If no non-significant variables are present, our model is significant.
Model Updation: Next, we’ll update the model to remove non-significant variables while retaining the significant ones. We use step() function for this as it updates the model in one go and then evaluate the performance of the significant model.
Performance Evaluation: From the model, we’ll calculate the chi-squared statistic and the predictor degree of freedom, which will give us the p-value, from which we can check if the model has a good fit or not (refer [@GLM]). Next, we’ll predict the likelihood of property being furnished, and generate the confusion matrix (refer [@ConfMatrix]) then evaluate model performance through metrics of accuracy, precision and recall.
Interpreting model coefficients: Finally, we’ll analyse the odds ratio to understand the effect of a unit change in the dependent variable on the odds of property being furnished.
In the final model below, the following observations are made:
The model has a relatively high residual deviance and a p-value of 0.99 (>0.05) generated from the chi-square statistic, indicating model is not good fit to the data.
The model has a very high accuracy of 92%, and the precision and recall are NaN (0/0) and 0 respectively.
The variable influencing is_furnished significantly is has_air_conditioning=1 (p-value < 0.05), while heating=other and has_terrace=1 have p-value>0.05.
From the odds ratio, we observe has_terrace=1 and has_air_conditioning=1 increase the likelihood of property being furnished by factors of 1.66 and 2.77 respectively, while heating=other decreases the likelihood by a factor of 0.36.
The issues in the model are:
The model does not fit the data very well as evident from the high residual deviance and chi-sq p-value. This indicates that the variables included in the model lack explanatory power in predicting is_furnished.
The precision and recall values indicate the model is not able to accurately classify the positive cases.
The p-value for the variables “heating=other” and “has_terrace=1” is greater than 0.05, indicating that they may not have a strong effect on is_furnished.
# Model building
furnished.max.model <- glm(formula = is_furnished ~ ., data = housedf_clean, family = "binomial")
# Summary of the model
summary(furnished.max.model)
##
## Call:
## glm(formula = is_furnished ~ ., family = "binomial", data = housedf_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3262 -0.4169 -0.3425 -0.2425 3.1676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.382e+00 5.709e-01 -5.924 3.14e-09 ***
## id -1.068e-06 2.001e-06 -0.533 0.5938
## price 1.240e-06 1.475e-06 0.841 0.4003
## mq -6.599e-04 1.876e-03 -0.352 0.7250
## floor2 -4.699e-01 2.969e-01 -1.583 0.1135
## floor3 -1.007e+00 5.433e-01 -1.853 0.0639 .
## floor4 4.797e-04 7.729e-01 0.001 0.9995
## floor5 4.257e-01 8.231e-01 0.517 0.6050
## floor6 1.297e+00 1.401e+00 0.926 0.3544
## floor7 -1.350e+01 6.271e+02 -0.022 0.9828
## n_rooms3 8.818e-01 5.098e-01 1.730 0.0837 .
## n_rooms4 9.132e-01 5.454e-01 1.674 0.0941 .
## n_rooms5 9.228e-01 6.001e-01 1.538 0.1241
## n_bathrooms2 -3.725e-02 2.965e-01 -0.126 0.9000
## n_bathrooms3 -1.412e+00 1.066e+00 -1.324 0.1855
## has_terrace1 3.825e-01 3.315e-01 1.154 0.2486
## has_alarm1 -2.238e-01 1.113e+00 -0.201 0.8407
## heatingother -1.048e+00 5.420e-01 -1.933 0.0532 .
## has_air_conditioning1 1.057e+00 2.627e-01 4.022 5.78e-05 ***
## has_parking1 -7.095e-01 1.071e+00 -0.663 0.5076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 502.28 on 902 degrees of freedom
## Residual deviance: 459.98 on 883 degrees of freedom
## AIC: 499.98
##
## Number of Fisher Scoring iterations: 14
# Updating model using step() to remove non-significant terms
furnished.model <- step(furnished.max.model)
## Start: AIC=499.98
## is_furnished ~ id + price + mq + floor + n_rooms + n_bathrooms +
## has_terrace + has_alarm + heating + has_air_conditioning +
## has_parking
##
## Df Deviance AIC
## - floor 6 468.41 496.41
## - n_rooms 3 463.85 497.85
## - has_alarm 1 460.02 498.02
## - mq 1 460.11 498.11
## - id 1 460.26 498.26
## - has_parking 1 460.50 498.50
## - n_bathrooms 2 462.58 498.58
## - price 1 460.67 498.67
## - has_terrace 1 461.25 499.25
## <none> 459.98 499.98
## - heating 1 464.79 502.79
## - has_air_conditioning 1 476.07 514.07
##
## Step: AIC=496.41
## is_furnished ~ id + price + mq + n_rooms + n_bathrooms + has_terrace +
## has_alarm + heating + has_air_conditioning + has_parking
##
## Df Deviance AIC
## - n_rooms 3 472.03 494.03
## - has_alarm 1 468.45 494.45
## - mq 1 468.60 494.60
## - id 1 468.64 494.64
## - has_parking 1 468.95 494.95
## - n_bathrooms 2 471.31 495.31
## - price 1 469.49 495.49
## - has_terrace 1 470.26 496.26
## <none> 468.41 496.41
## - heating 1 473.70 499.70
## - has_air_conditioning 1 484.48 510.48
##
## Step: AIC=494.03
## is_furnished ~ id + price + mq + n_bathrooms + has_terrace +
## has_alarm + heating + has_air_conditioning + has_parking
##
## Df Deviance AIC
## - mq 1 472.04 492.04
## - has_alarm 1 472.06 492.06
## - id 1 472.24 492.24
## - has_parking 1 472.59 492.59
## - n_bathrooms 2 474.90 492.90
## - price 1 473.37 493.37
## - has_terrace 1 473.92 493.92
## <none> 472.03 494.03
## - heating 1 477.31 497.31
## - has_air_conditioning 1 488.38 508.38
##
## Step: AIC=492.04
## is_furnished ~ id + price + n_bathrooms + has_terrace + has_alarm +
## heating + has_air_conditioning + has_parking
##
## Df Deviance AIC
## - has_alarm 1 472.07 490.07
## - id 1 472.25 490.25
## - has_parking 1 472.60 490.60
## - n_bathrooms 2 474.97 490.97
## - price 1 473.40 491.40
## - has_terrace 1 473.95 491.95
## <none> 472.04 492.04
## - heating 1 477.31 495.31
## - has_air_conditioning 1 488.44 506.44
##
## Step: AIC=490.07
## is_furnished ~ id + price + n_bathrooms + has_terrace + heating +
## has_air_conditioning + has_parking
##
## Df Deviance AIC
## - id 1 472.28 488.28
## - has_parking 1 472.63 488.63
## - n_bathrooms 2 475.02 489.02
## - price 1 473.40 489.40
## - has_terrace 1 474.00 490.00
## <none> 472.07 490.07
## - heating 1 477.36 493.36
## - has_air_conditioning 1 488.52 504.52
##
## Step: AIC=488.28
## is_furnished ~ price + n_bathrooms + has_terrace + heating +
## has_air_conditioning + has_parking
##
## Df Deviance AIC
## - has_parking 1 472.86 486.86
## - n_bathrooms 2 475.17 487.17
## - price 1 473.54 487.54
## - has_terrace 1 474.23 488.23
## <none> 472.28 488.28
## - heating 1 477.65 491.65
## - has_air_conditioning 1 488.75 502.75
##
## Step: AIC=486.86
## is_furnished ~ price + n_bathrooms + has_terrace + heating +
## has_air_conditioning
##
## Df Deviance AIC
## - n_bathrooms 2 475.69 485.69
## - price 1 474.11 486.11
## - has_terrace 1 474.68 486.68
## <none> 472.86 486.86
## - heating 1 478.39 490.39
## - has_air_conditioning 1 488.96 500.96
##
## Step: AIC=485.69
## is_furnished ~ price + has_terrace + heating + has_air_conditioning
##
## Df Deviance AIC
## - price 1 476.47 484.47
## <none> 475.69 485.69
## - has_terrace 1 477.78 485.78
## - heating 1 480.88 488.88
## - has_air_conditioning 1 491.67 499.67
##
## Step: AIC=484.47
## is_furnished ~ has_terrace + heating + has_air_conditioning
##
## Df Deviance AIC
## <none> 476.47 484.47
## - has_terrace 1 478.90 484.90
## - heating 1 481.43 487.43
## - has_air_conditioning 1 492.41 498.41
# Summary of the updated model
summary(furnished.model)
##
## Call:
## glm(formula = is_furnished ~ has_terrace + heating + has_air_conditioning,
## family = "binomial", data = housedf_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6807 -0.4239 -0.3320 -0.3320 2.7994
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.8708 0.1880 -15.271 < 2e-16 ***
## has_terrace1 0.5067 0.3137 1.615 0.1063
## heatingother -1.0274 0.5285 -1.944 0.0519 .
## has_air_conditioning1 1.0199 0.2549 4.001 6.31e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 502.28 on 902 degrees of freedom
## Residual deviance: 476.47 on 899 degrees of freedom
## AIC: 484.47
##
## Number of Fisher Scoring iterations: 6
#plot(furnished.model)
# Predict the probability of property being furnished
housedf_clean$pfurnished <- predict(furnished.model, type = "response")
# Analysing pfurnished values
summary(housedf_clean$pfurnished)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01987 0.05361 0.05361 0.07973 0.13577 0.20682
# Calculating Chi-Square statistic for furnished model
chisq.furnished <- furnished.model$null.deviance - furnished.model$deviance
chisq.furnished
## [1] 25.81885
# Calculating predictor values degree of freedom
df.furnished <- furnished.model$df.null - furnished.model$df.residual
df.furnished
## [1] 3
# Calculating p-value from chi-square statistics
pchisq(chisq.furnished, df = df.furnished)
## [1] 0.9999896
# Transforming pfurnished to binary factors (1 if pfurnished > 0.5, 0 otherwise)
predictions_factor <- as.factor(ifelse(housedf_clean$pfurnished > 0.5, 1, 0))
# Call the confusionMatrix() function to generate the confusion matrix
confusion_matrix <- confusionMatrix(data = predictions_factor, reference = housedf_clean$is_furnished, positive = "1")
## Warning in confusionMatrix.default(data = predictions_factor, reference =
## housedf_clean$is_furnished, : Levels are not in the same order for reference and
## data. Refactoring data to match.
# Display the confusion matrix
confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 831 72
## 1 0 0
##
## Accuracy : 0.9203
## 95% CI : (0.9006, 0.9371)
## No Information Rate : 0.9203
## P-Value [Acc > NIR] : 0.5313
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.92027
## Prevalence : 0.07973
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##
# Odd ratios
exp(coef(furnished.model))
## (Intercept) has_terrace1 heatingother
## 0.05665217 1.65977015 0.35793041
## has_air_conditioning1
## 2.77299760
[@r-charts] r-charts (2020). “Pie Chart Labels Outside with ggplot2.” Retrieved from https://r-charts.com/part-whole/pie-chart-labels-outside-ggplot2/
[@vif] VIF (2020). “vif: Variance Inflation Factors.” Retrieved from https://www.rdocumentation.org/packages/VIF/versions/1.0/topics/vif
[@GLM] Statology (2020). Interpret GLM Output in R. Retrieved from https://www.statology.org/interpret-glm-output-in-r/.
[@ConfMatrix] R Development Core Team (2020). caret: Classification and Regression Training. Retrieved from https://CRAN.R-project.org/package=caret